ggplot
We load the cars dataset and plot it with R's base graphics.
# Load the built-in cars data set, open its help page and preview the
# first rows.
data("cars")
help("cars")
head(cars)
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
# Base-graphics plot of the two-column data frame (speed against dist).
plot(cars)
Load the ggplot2 library to see how it improves on R's basic charts:
# install.packages("ggplot2")  # Run once if ggplot2 is not installed yet
library("ggplot2")

# Scatterplot: bind the data, map speed to x and dist to y;
# shape = 1 draws the points as hollow circles.
ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point(shape = 1)

# Same points plus a linear-regression line with a 95% confidence band.
ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm)

# Regression line only: se = FALSE removes the confidence band.
ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm, se = FALSE)

# No method given: a smooth curve with a 95% confidence band is added and
# ggplot2 reports the smoothing method it picked (see the message below).
ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point(shape = 1) +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
More scatterplots
# Switch to the diamonds data set: load it, open its help page, then look at
# the first rows and at the column structure.
data("diamonds")
help("diamonds")
head(diamonds)
## # A tibble: 6 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# With no shape given, geom_point() draws filled circles.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point()

# Shape and size of the points can be set explicitly.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(shape = 25, size = 4)

# Mapping colour to a factor (cut) colours the points by its levels.
ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = price, colour = cut)
) +
  geom_point(size = 1.5)

# Reuse one base plot and change only the points' alpha (opacity), from
# fully opaque (1) down to 0.3; overlapping points then build up darker areas.
grayScale <- ggplot(data = diamonds, mapping = aes(x = carat, y = price))
grayScale + geom_point(alpha = 1)
grayScale + geom_point(alpha = 0.8)
grayScale + geom_point(alpha = 0.5)
grayScale + geom_point(alpha = 0.3)

# A binned scatterplot: the plane is cut into rectangles that are coloured
# by how many observations fall into each one.
bin <- ggplot(data = diamonds, mapping = aes(x = carat, y = price))
bin + stat_bin2d()
We load the mpg dataset and plot a histogram of the highway consumption (hwy).
# Load the mpg data set, open its help page and preview the first rows.
data("mpg")
help("mpg")
head(mpg)
## # A tibble: 6 x 11
## manufacturer model displ year cyl trans drv cty hwy fl
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p
## 3 audi a4 2.0 2008 4 manual(m6) f 20 31 p
## 4 audi a4 2.0 2008 4 auto(av) f 21 30 p
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p
## 6 audi a4 2.8 1999 6 manual(m5) f 18 26 p
## # ... with 1 more variables: class <chr>
# Base-graphics histogram of the hwy column, for comparison with ggplot2.
hist(mpg$hwy)
ggplot histograms, with options for adding a density curve and for setting the number of bins.
# binwidth sets the width of each bin.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(binwidth = 3)

# bins sets the number of bins instead.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(bins = 30)

# colour is the bar outline, fill is the bar interior.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(binwidth = 0.5, colour = "red", fill = "green")

# Density curve of hwy, filled in yellow.
ggplot(mpg, aes(x = hwy)) +
  geom_density(fill = "yellow")

# Histogram with a density curve on top. The histogram's y is mapped to the
# computed density (instead of the count) so that both layers share one scale.
# after_stat(density) replaces the deprecated `..density..` notation and
# needs ggplot2 >= 3.3.0.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 1.2,
    colour = "black",
    fill = "blue"
  ) +
  geom_density(alpha = 0.3, fill = "red")
Histograms can also show a variable split across several categories. To do this, we select two manufacturers: audi and volkswagen:
# Keep only the rows of the two manufacturers. %in% tests membership in the
# set in one step and never returns NA, so it cannot produce all-NA rows the
# way a chain of `==` comparisons joined with `|` can.
mpg_subset <- mpg[mpg$manufacturer %in% c("audi", "volkswagen"), ]
str(mpg_subset)
## Classes 'tbl_df', 'tbl' and 'data.frame': 45 obs. of 11 variables:
## $ manufacturer: chr "audi" "audi" "audi" "audi" ...
## $ model : chr "a4" "a4" "a4" "a4" ...
## $ displ : num 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr "f" "f" "f" "f" ...
## $ cty : int 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr "p" "p" "p" "p" ...
## $ class : chr "compact" "compact" "compact" "compact" ...

# Overlapping histograms: fill = manufacturer gives each manufacturer its own
# fill colour; position = "identity" draws the bars on top of each other and
# alpha = 0.5 makes them semi-transparent.
ggplot(mpg_subset, aes(x = hwy, fill = manufacturer)) +
  geom_histogram(binwidth = 2, alpha = 0.5, position = "identity")

# Side-by-side histograms: position = "dodge" places the bars of the two
# manufacturers next to each other within every bin.
ggplot(mpg_subset, aes(x = hwy, fill = manufacturer)) +
  geom_histogram(binwidth = 2, position = "dodge")

# Density curves, one per manufacturer (opaque fill).
ggplot(mpg_subset, aes(x = hwy, fill = manufacturer)) +
  geom_density()

# The same density curves with a semi-transparent fill.
ggplot(mpg_subset, aes(x = hwy, fill = manufacturer)) +
  geom_density(alpha = 0.4)
More histograms and options
# More sophisticated histograms
# Fix the random seed so that the sampled subset is reproducible.
set.seed(4235)
# Random subset of 1000 rows of diamonds.
diamonds_small <- diamonds[sample(nrow(diamonds), 1000), ]

# Plain histogram of price for the subset.
ggplot(diamonds_small, aes(x = price)) +
  geom_histogram(binwidth = 1000)

# Frequency polygons of the density, one coloured line per level of cut.
# y = after_stat(density) replaces the deprecated positional `..density..`
# and needs ggplot2 >= 3.3.0.
ggplot(diamonds_small, aes(x = price, y = after_stat(density), colour = cut)) +
  geom_freqpoly(binwidth = 1000)

# Histogram filled by cut; with geom_histogram's default position the bars
# of the different cuts are stacked.
ggplot(diamonds_small, aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 1000)

# Overlapping, semi-transparent density curves, one per level of cut.
ggplot(diamonds_small, aes(x = price, fill = cut)) +
  geom_density(alpha = 0.3)
We load the tips dataset to observe its distribution.
# install.packages("reshape2")  # Run once if reshape2 is not installed yet
library("reshape2")
data("tips")
help("tips")
head(tips)
## total_bill tip sex smoker day time size
## 1 16.99 1.01 Female No Sun Dinner 2
## 2 10.34 1.66 Male No Sun Dinner 3
## 3 21.01 3.50 Male No Sun Dinner 3
## 4 23.68 3.31 Male No Sun Dinner 2
## 5 24.59 3.61 Female No Sun Dinner 4
## 6 25.29 4.71 Male No Sun Dinner 4

# Basic bar chart of total_bill against time of day. stat = "identity" uses
# the y values as they are and the bars of all rows with the same time are
# stacked, so each bar's height is the summed total_bill.
# The dinner bar comes out higher than the lunch bar.
ggplot(data = tips, mapping = aes(x = time, y = total_bill)) +
  geom_bar(stat = "identity")

# The same chart with the bars split (filled) by smoker.
ggplot(data = tips, mapping = aes(x = time, y = total_bill, fill = smoker)) +
  geom_bar(stat = "identity")
It can be observed that Sunday is the day on which tables pay the most on average.
# Number of records (bills) per day.
ggplot(data = tips, aes(x = day)) +
  geom_bar(stat = "count")

# Mean total bill per day: stat = "summary" applies the function given in
# `fun` to the y values of each x group. `fun` replaces the deprecated
# `fun.y` argument and needs ggplot2 >= 3.3.0.
ggplot(data = tips, aes(x = day, y = total_bill)) +
  geom_bar(stat = "summary", fun = "mean")
These charts are like bar charts, but with lines connecting the points. We load a dataset of tooth growth in guinea pigs.
# Load the ToothGrowth data set, open its help page and preview the first
# rows.
data("ToothGrowth")
help("ToothGrowth")
head(ToothGrowth)
## len supp dose
## 1 4.2 VC 0.5
## 2 11.5 VC 0.5
## 3 7.3 VC 0.5
## 4 5.8 VC 0.5
## 5 6.4 VC 0.5
## 6 10.0 VC 0.5
# Short alias used by the code that follows.
tg <- ToothGrowth
We compute a summary of the variables, which allows us to draw plots with confidence margins:
# install.packages("Rmisc")  # Run once if Rmisc is not installed yet
library("Rmisc")
## Loading required package: lattice
## Loading required package: plyr
# Summarise the measured variable len for every combination of the grouping
# variables supp and dose.
tgc <- summarySE(
  tg,
  measurevar = "len",
  groupvars = c("supp", "dose")
)
# The result has one row per group with the columns shown below
# (N, len, sd, se, ci).
tgc
## supp dose N len sd se ci
## 1 OJ 0.5 10 13.23 4.459709 1.4102837 3.190283
## 2 OJ 1.0 10 22.70 3.910953 1.2367520 2.797727
## 3 OJ 2.0 10 26.06 2.655058 0.8396031 1.899314
## 4 VC 0.5 10 7.98 2.746634 0.8685620 1.964824
## 5 VC 1.0 10 16.77 2.515309 0.7954104 1.799343
## 6 VC 2.0 10 26.14 4.797731 1.5171757 3.432090
We draw factor charts. We can observe the difference between giving one supplement or the other to the guinea pigs. The vertical bars show the mean tooth length plus and minus one standard error (the code uses len - se and len + se, not the 95% confidence interval stored in the ci column) for each dose of each supplement.
# Factor chart with error bars, one coloured series per supplement.
ggplot(data = tgc, mapping = aes(x = dose, y = len, colour = supp)) +
  # Error bars from len - se to len + se (se = standard error column of tgc)
  geom_errorbar(mapping = aes(ymin = len - se, ymax = len + se), width = 0.1) +
  # Line joining the values of each series
  geom_line() +
  # Points at the values themselves
  geom_point()
# The same kind of chart for the tips by day, drawn in red.
# The colour is a fixed value, so it is set on each geom outside aes().
# Inside aes() the string "red" would be treated as data: ggplot2 would pick
# a colour from its default palette and add a legend entry labelled "red".
ggplot(tips, aes(x = day, y = tip)) +
  geom_line(colour = "red") +
  geom_point(colour = "red")
# Boxplot of price for every cut in the diamonds data set.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

# Boxplot of the tips by day, with the axes swapped by coord_flip().
# It can be observed that tips are higher on average on Sunday, while on
# Saturday there are more tables leaving higher tips.
ggplot(data = tips, mapping = aes(x = day, y = tip)) +
  geom_boxplot() +
  coord_flip()